{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "from sklearn.linear_model import LinearRegression" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def lm(x, y, data, intercept=True):\n", " \"\"\"Returns the coefficients from regressing y on x.\n", " \n", " Inputs:\n", " - x: a list containing the names of the x variables\n", " - y: the name of the y variable\n", " - data: a Pandas data frame (the names in x and y must be columns in this data frame)\n", " - intercept: boolean indicating whether or not to include an intercept term\n", " \n", " Outputs: A Pandas series with the estimated coefficients, indexed by the x variable names.\n", " \"\"\"\n", " \n", " # expand categorical variables into binary variables\n", " new_cols = []\n", " for col in x:\n", " # if it's a categorical, expand it using pd.get_dummies()\n", " if data[col].dtype == object:\n", " new_cols.append(pd.get_dummies(data[[col]], drop_first=True))\n", " # otherwise, just append the variable as is\n", " else:\n", " new_cols.append(data[[col]])\n", " X = pd.concat(new_cols, axis=1)\n", " \n", " print(np.linalg.cond(np.dot(X.T, X)))\n", " \n", " Y = data[y]\n", " \n", " if intercept:\n", " names = [\"Intercept\"] + list(X.columns)\n", " ones = pd.Series(1, index=data.index)\n", " X = pd.concat([ones, X], axis=1)\n", " else:\n", " names = list(X.columns)\n", " \n", " beta = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, Y))\n", " \n", " return pd.Series(data=beta, index=names)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Some Data To Test Your Code" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "predictors = [\"symboling\", \"normalized-losses\", \"make\", \"fuel-type\",\n", " \"aspiration\", \"num-of-doors\", \"body-style\", \"drive-wheels\",\n", " \"engine-location\", \"wheel-base\", \"length\", \"width\",\n", " \"height\", \"curb-weight\", \"engine-type\", \"num-of-cylinders\",\n", " \"engine-size\", \"fuel-system\", \"bore\", \"stroke\",\n", " \"compression-ratio\", \"horsepower\", \"peak-rpm\", \"city-mpg\",\n", " \"highway-mpg\"]\n", "data = pd.read_csv(\"http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data\",\n", " header=None,\n", " names=predictors + [\"price\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The following code strips out missing values (represented by \"?\" in this data set) and converts columns to numeric types before fitting linear regression to the data." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(205, 26)\n", "(159, 26)\n" ] } ], "source": [ "print(data.shape)\n", "\n", "for col in data.columns:\n", " if data[col].dtype == object:\n", " data = data[data[col] != \"?\"]\n", " try:\n", " data[col] = pd.to_numeric(data[col])\n", " except:\n", " pass\n", " \n", "print(data.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test 1: Quantitative Predictors Only\n", "\n", "Let's test out the `lm` function you just wrote on some quantitative predictors." ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "110.72172389\n" ] }, { "data": { "text/plain": [ "Intercept -131136.766862\n", "length 122.591338\n", "width 1997.837168\n", "height -178.613267\n", "dtype: float64" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lm([\"length\", \"width\", \"height\"], \"price\", data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check that your `lm` function produces the same results as scikit-learn." ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(-131136.76686224531, array([ 122.59133841, 1997.83716768, -178.61326723]))" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = LinearRegression()\n", "model.fit(data[[\"length\", \"width\", \"height\"]], data[\"price\"])\n", "model.intercept_, model.coef_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test 2: Categorical Predictors\n", "\n", "Your `lm` function should also do the right thing for categorical variables automatically (i.e., it should expand categorical variables with $k$ levels into $k-1$ 0-1 variables automatically)." ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "8.60303579656e+19\n" ] }, { "data": { "text/plain": [ "Intercept 101623.946789\n", "symboling -5.067821\n", "normalized-losses 5.577457\n", "make_bmw 359.610692\n", "make_chevrolet -4745.387730\n", "make_dodge -6209.191496\n", "make_honda -1582.712461\n", "make_jaguar 2430.797572\n", "make_mazda -4062.659445\n", "make_mercedes-benz 2548.338064\n", "make_mitsubishi -6327.595985\n", "make_nissan -3689.681752\n", "make_peugot 68443.249796\n", "make_plymouth -6024.951373\n", "make_porsche 4830.408327\n", "make_saab -404.038771\n", "make_subaru -58336.174774\n", "make_toyota -5869.407539\n", "make_volkswagen -4297.346425\n", "make_volvo -2871.342061\n", "fuel-type_gas -89180.187383\n", "aspiration_turbo 2171.490389\n", "num-of-doors_two -838.068778\n", "body-style_hardtop -5626.870389\n", "body-style_hatchback -5735.987637\n", "body-style_sedan -5702.431171\n", "body-style_wagon -5647.437732\n", "drive-wheels_fwd -29.356867\n", "drive-wheels_rwd 1977.187880\n", "wheel-base 318.440516\n", "length -76.626594\n", "width 243.692078\n", "height -335.218743\n", "curb-weight 5.208202\n", "engine-type_l -78263.126984\n", "engine-type_ohc -1913.256121\n", "engine-type_ohcf 51019.479134\n", "engine-type_ohcv -1337.149854\n", "num-of-cylinders_five -4108.064456\n", "num-of-cylinders_four -4688.467080\n", "num-of-cylinders_six -2976.251738\n", "num-of-cylinders_three 73586.269679\n", "engine-size -12.438327\n", "fuel-system_2bbl 2069.509402\n", "fuel-system_idi -78533.441448\n", "fuel-system_mfi 3467.897329\n", "fuel-system_mpfi 2601.780297\n", "fuel-system_spdi 1080.926758\n", "bore -881.685811\n", "stroke -567.659597\n", "compression-ratio -700.029130\n", "horsepower -20.192172\n", "peak-rpm -0.537667\n", "city-mpg -156.388896\n", "highway-mpg 128.416202\n", "dtype: float64" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coefs1 = lm(predictors, \"price\", data)\n", "coefs1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check that your `lm` function produces the same results as scikit-learn." ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "symboling -5.067821\n", "normalized-losses 5.577457\n", "wheel-base 318.440516\n", "length -76.626594\n", "width 243.692078\n", "height -335.218743\n", "curb-weight 5.208202\n", "engine-size -12.438327\n", "bore -881.685811\n", "stroke -567.659597\n", "compression-ratio -700.029130\n", "horsepower -20.192172\n", "peak-rpm -0.537667\n", "city-mpg -156.388896\n", "highway-mpg 128.416202\n", "make_bmw 359.610692\n", "make_chevrolet -4745.387730\n", "make_dodge -6209.191496\n", "make_honda -1582.712461\n", "make_jaguar 2430.797572\n", "make_mazda -4062.659445\n", "make_mercedes-benz 2548.338064\n", "make_mitsubishi -6327.595985\n", "make_nissan -3689.681752\n", "make_peugot -4987.632357\n", "make_plymouth -6024.951373\n", "make_porsche 4830.408327\n", "make_saab -404.038771\n", "make_subaru -3658.347820\n", "make_toyota -5869.407539\n", "make_volkswagen -4297.346425\n", "make_volvo -2871.342061\n", "fuel-type_gas -5323.372967\n", "aspiration_turbo 2171.490389\n", "num-of-doors_two -838.068778\n", "body-style_hardtop -5626.870389\n", "body-style_hatchback -5735.987637\n", "body-style_sedan -5702.431171\n", "body-style_wagon -5647.437732\n", "drive-wheels_fwd -29.356867\n", "drive-wheels_rwd 1977.187880\n", "engine-type_l -4832.244831\n", "engine-type_ohc -1913.256121\n", "engine-type_ohcf -3658.347820\n", "engine-type_ohcv -1337.149854\n", "num-of-cylinders_five -4108.064456\n", "num-of-cylinders_four -4688.467080\n", "num-of-cylinders_six -2976.251738\n", "num-of-cylinders_three 155.387526\n", "fuel-system_2bbl 2069.509402\n", "fuel-system_idi 5323.372967\n", "fuel-system_mfi 3467.897329\n", "fuel-system_mpfi 2601.780297\n", "fuel-system_spdi 1080.926758\n", "Intercept 17767.132374\n", "dtype: float64" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = LinearRegression()\n", "data_expanded = pd.get_dummies(data[predictors], drop_first=True)\n", "model.fit(data_expanded, data[\"price\"])\n", "coefs2 = pd.Series(model.coef_, index=data_expanded.columns)\n", "coefs2[\"Intercept\"] = model.intercept_\n", "coefs2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To debug why the intercepts are different (but most of the coefficients seem correct), we have to compare the coefficients from our `lm` function and scikit-learn. It's pretty hard to eyeball it because there are so many coefficients. Let's join the two to each other." ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
Intercept101623.94678917767.132374
aspiration_turbo2171.4903892171.490389
body-style_hardtop-5626.870389-5626.870389
body-style_hatchback-5735.987637-5735.987637
body-style_sedan-5702.431171-5702.431171
body-style_wagon-5647.437732-5647.437732
bore-881.685811-881.685811
city-mpg-156.388896-156.388896
compression-ratio-700.029130-700.029130
curb-weight5.2082025.208202
drive-wheels_fwd-29.356867-29.356867
drive-wheels_rwd1977.1878801977.187880
engine-size-12.438327-12.438327
engine-type_l-78263.126984-4832.244831
engine-type_ohc-1913.256121-1913.256121
engine-type_ohcf51019.479134-3658.347820
engine-type_ohcv-1337.149854-1337.149854
fuel-system_2bbl2069.5094022069.509402
fuel-system_idi-78533.4414485323.372967
fuel-system_mfi3467.8973293467.897329
fuel-system_mpfi2601.7802972601.780297
fuel-system_spdi1080.9267581080.926758
fuel-type_gas-89180.187383-5323.372967
height-335.218743-335.218743
highway-mpg128.416202128.416202
horsepower-20.192172-20.192172
length-76.626594-76.626594
make_bmw359.610692359.610692
make_chevrolet-4745.387730-4745.387730
make_dodge-6209.191496-6209.191496
make_honda-1582.712461-1582.712461
make_jaguar2430.7975722430.797572
make_mazda-4062.659445-4062.659445
make_mercedes-benz2548.3380642548.338064
make_mitsubishi-6327.595985-6327.595985
make_nissan-3689.681752-3689.681752
make_peugot68443.249796-4987.632357
make_plymouth-6024.951373-6024.951373
make_porsche4830.4083274830.408327
make_saab-404.038771-404.038771
make_subaru-58336.174774-3658.347820
make_toyota-5869.407539-5869.407539
make_volkswagen-4297.346425-4297.346425
make_volvo-2871.342061-2871.342061
normalized-losses5.5774575.577457
num-of-cylinders_five-4108.064456-4108.064456
num-of-cylinders_four-4688.467080-4688.467080
num-of-cylinders_six-2976.251738-2976.251738
num-of-cylinders_three73586.269679155.387526
num-of-doors_two-838.068778-838.068778
peak-rpm-0.537667-0.537667
stroke-567.659597-567.659597
symboling-5.067821-5.067821
wheel-base318.440516318.440516
width243.692078243.692078
\n", "
" ], "text/plain": [ " 0 1\n", "Intercept 101623.946789 17767.132374\n", "aspiration_turbo 2171.490389 2171.490389\n", "body-style_hardtop -5626.870389 -5626.870389\n", "body-style_hatchback -5735.987637 -5735.987637\n", "body-style_sedan -5702.431171 -5702.431171\n", "body-style_wagon -5647.437732 -5647.437732\n", "bore -881.685811 -881.685811\n", "city-mpg -156.388896 -156.388896\n", "compression-ratio -700.029130 -700.029130\n", "curb-weight 5.208202 5.208202\n", "drive-wheels_fwd -29.356867 -29.356867\n", "drive-wheels_rwd 1977.187880 1977.187880\n", "engine-size -12.438327 -12.438327\n", "engine-type_l -78263.126984 -4832.244831\n", "engine-type_ohc -1913.256121 -1913.256121\n", "engine-type_ohcf 51019.479134 -3658.347820\n", "engine-type_ohcv -1337.149854 -1337.149854\n", "fuel-system_2bbl 2069.509402 2069.509402\n", "fuel-system_idi -78533.441448 5323.372967\n", "fuel-system_mfi 3467.897329 3467.897329\n", "fuel-system_mpfi 2601.780297 2601.780297\n", "fuel-system_spdi 1080.926758 1080.926758\n", "fuel-type_gas -89180.187383 -5323.372967\n", "height -335.218743 -335.218743\n", "highway-mpg 128.416202 128.416202\n", "horsepower -20.192172 -20.192172\n", "length -76.626594 -76.626594\n", "make_bmw 359.610692 359.610692\n", "make_chevrolet -4745.387730 -4745.387730\n", "make_dodge -6209.191496 -6209.191496\n", "make_honda -1582.712461 -1582.712461\n", "make_jaguar 2430.797572 2430.797572\n", "make_mazda -4062.659445 -4062.659445\n", "make_mercedes-benz 2548.338064 2548.338064\n", "make_mitsubishi -6327.595985 -6327.595985\n", "make_nissan -3689.681752 -3689.681752\n", "make_peugot 68443.249796 -4987.632357\n", "make_plymouth -6024.951373 -6024.951373\n", "make_porsche 4830.408327 4830.408327\n", "make_saab -404.038771 -404.038771\n", "make_subaru -58336.174774 -3658.347820\n", "make_toyota -5869.407539 -5869.407539\n", "make_volkswagen -4297.346425 -4297.346425\n", "make_volvo -2871.342061 -2871.342061\n", "normalized-losses 5.577457 5.577457\n", "num-of-cylinders_five -4108.064456 -4108.064456\n", "num-of-cylinders_four -4688.467080 -4688.467080\n", "num-of-cylinders_six -2976.251738 -2976.251738\n", "num-of-cylinders_three 73586.269679 155.387526\n", "num-of-doors_two -838.068778 -838.068778\n", "peak-rpm -0.537667 -0.537667\n", "stroke -567.659597 -567.659597\n", "symboling -5.067821 -5.067821\n", "wheel-base 318.440516 318.440516\n", "width 243.692078 243.692078" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([coefs1, coefs2], axis=1)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }